Project
! pip install plotly_express
! pip install geopandas
#Importing the necessary libraries
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import os
import pandas as pd
import seaborn as sns
import math
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from pandas.plotting import scatter_matrix
# BUG FIX: `vincenty` was removed in geopy 2.0; `geodesic` is its drop-in
# replacement. (The name is not referenced anywhere in the visible code,
# so this only prevents the import itself from crashing.)
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
import folium
from folium import Marker
import geopandas as gpd
import plotly_express as px
from xgboost.sklearn import XGBRegressor
import gc
from sklearn.pipeline import Pipeline
from scipy.stats import zscore
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score,RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
sns.set(rc={'figure.figsize':(11,8)})
'''Suppress Scientific Notation '''
pd.options.display.float_format = '{:.2f}'.format
from google.colab import drive
drive.mount('/content/drive')
#reading the data as a data frame
path = "/content/drive/MyDrive/ML Capstone Project - 2022/Data/innercity.csv"
data_city=pd. read_csv(path)
# First look at the raw data: sample rows, dimensions and column dtypes.
print (data_city.head())
print (data_city.shape)
print(data_city.info())
# Summary statistics for every numeric column (transposed for readability).
data_city.describe().transpose()
# Per-column check for missing values.
data_city.isnull().any()
''' Quick confirmation along with the above one'''
data_city.isna().sum()
# Count fully duplicated rows.
data_city.duplicated().sum()
There are no nulls or duplicates present in the data frame. Now let's examine the correlations between the features.
data_city.corr()
plt.figure(figsize= (18,15))
sns.heatmap(data_city.corr())
#plotting a heatmap for the correlation
Univariate analysis with box plots to check whether any columns contain outliers.
# Box-plot every column; columns that cannot be box-plotted raise and the
# error is reported instead of stopping the loop.
for column_name in data_city.columns:
    try:
        data_city.iloc[:, 1:].boxplot(column=column_name)
        plt.show()
    except Exception as err:
        print(err)
#print(data_city.columns)
columns = data_city.select_dtypes(include=np.number).columns.tolist()
'''Histogram of each column, one figure at a time.'''
for feature in data_city.columns:
    hist = data_city[feature].hist(bins=10)
    print("Distribution Plot for {0}".format(feature))
    plt.show()
Pair plot: Matrix of scatterplots that lets you understand the pairwise relationship between different variables in a dataset.
plt.figure(figsize= (75,50))
sns.pairplot(data_city)
''' Divide into multiple columns'''
print(pd.isnull(data_city).any())
plt.figure(figsize = (10,8))
sns.distplot(data_city['price'], kde=False, bins=8)
Most of the housing prices are quoted around 1 million and go up to 3 million.
sns.lineplot(x='living_measure', y='price', data=data_city)
There is a clear correlation between the living area of the house and the price. If the area is too big, the price starts to decrease as there might be few buyers for the big house which are costly to maintain.
Following are a few observations showing how the prices are correlated with various features.
sns.lineplot(x='yr_built', y='price', data=data_city)
sns.lineplot(x='quality', y='price', data=data_city)
sns.lineplot(x='room_bed', y='price', data=data_city)
There is a clear increasing trend of price with the number of bedrooms, although some outliers are present.
sns.lineplot(x='room_bath', y='price', data=data_city)
There is a clear upward trend in the price with increase in the room_bath.
sns.lineplot(x='condition', y='price', data=data_city)
sns.lineplot(x="furnished", y="price", data=data_city);
plt.figure(figsize=(15,10))
print(sns.scatterplot(data_city['living_measure'],data_city['price'], hue = data_city['furnished']))
It can be seen that there is a clear increase in the living measure with respect to the price and furnished with outliers.
#lot_measure >100000 - price increases with increase in living measure
plt.figure(figsize=(20, 15))
print(sns.scatterplot(data_city['lot_measure'],data_city['price']))
There is not much correlation between the lot measure and the price.
print(sns.scatterplot(data_city['ceil'],data_city['price']))
print(sns.lineplot(data_city['coast'],data_city['price']))
Houses with water_front tend to have higher price compared to that of non-water_front properties.
# Restrict to houses that were actually renovated (yr_renovated > 0).
renovated = data_city[data_city['yr_renovated'] > 0]
# BUG FIX: the y-series must come from the same filtered frame; the old code
# paired the filtered renovation years with the *unfiltered* price column,
# giving mismatched lengths / misaligned points.
print(sns.scatterplot(renovated['yr_renovated'], renovated['price']))
Houses renovated after 1975 tend to have a higher price.
print(sns.scatterplot(data_city['lot_measure'],data_city['price'],hue=data_city['furnished']))
Furnished houses have higher price than that of the Non-furnished houses.
sns.scatterplot(data_city['total_area'],data_city['price'],hue=data_city['furnished'])
Total area doesnt have any direct correlation with the price. Can be omitted while doing feature selection.
'''
Extract only the year of the sale from the raw dayhours timestamp.
'''
# Assumes the first four characters of dayhours encode the year — TODO confirm
# against the raw data format.
data_city["Year"] = data_city["dayhours"].apply(lambda x:x[0:4])
data_city['Year'] = pd.to_numeric(data_city['Year'])
print (data_city.head())
data_city.columns
'''Remove the unwanted columns'''
# cid is a row identifier and dayhours has been replaced by Year, so both go.
data_city.drop( columns = ['cid','dayhours'],inplace = True)
The sale year does not show any strong correlation with the price.
data_city.head(5)
print(data_city.info())
img=mpimg.imread('/content/drive/MyDrive/ML Capstone Project - 2022/Data/lake-forest-park-wa-5337270.gif')
data_city.plot(kind='scatter', x='long', y='lat', alpha=0.9,
label='population', figsize=(25,20),c='price',s=data_city['living_measure']/100.0,cmap=plt.get_cmap('jet'),
colorbar=True)
plt.imshow(img, extent=[-122.52, -121.31, 47.16, 47.78], alpha=0.2)
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)
plt.legend()
plt.show()
Below is a map built from the latitude and longitude; it shows the price of a particular area when hovering over a point.
fig = px.scatter_mapbox(data_city, lat="lat", lon="long",
hover_data = ['price'],
color_discrete_sequence=["blue", "green"],
zoom=13,
height=800,
title = 'Inner City Price Map')
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
Set up the X and Y from dataset
#splitting the data into X and Y
x = data_city.drop("price", axis=1)
y = data_city["price"]
print(x.shape)
print(y.shape)
As discussed following regression models can be used to evaluate the models.
#splitting the data in test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 547)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print (x_train['Year'].unique())
print(x_train.info())
print (x_test['Year'].unique())
print(x_test.info())
#creating the basic models
# Baseline (mostly default-parameter) regressors used for a first comparison
# pass before any tuning or feature selection.
lin_regression = LinearRegression()
lasso = Lasso()
ridge = Ridge()
randomForest = RandomForestRegressor()
knn = KNeighborsRegressor()
xgb = XGBRegressor()
gb=GradientBoostingRegressor( learning_rate = 0.1)
# Collected so they can be fitted and scored in one loop below.
reg_models = [lin_regression,lasso,ridge,randomForest,knn,xgb,gb]
print(reg_models)
#generating the scores
# Fit every baseline model, score it on train/test, and collect the metrics
# (and test-set predictions) keyed by the model's repr string.
model_output_phase1 = {}
for model in reg_models:
    mod_result = {}
    try:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        mod_result['Train Accuracy'] = model.score(x_train, y_train) * 100
        mod_result['Test Accuracy'] = model.score(x_test, y_test) * 100
        # BUG FIX: mean_squared_error(..., squared=False) already returns the
        # RMSE; the old code applied np.sqrt a second time and stored
        # sqrt(RMSE) instead of the RMSE.
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mae = mean_absolute_error(y_test, y_pred)
        mod_result['RMSE'] = rmse
        mod_result['Mean Absolute Error'] = mae
        mod_result['Predicted'] = y_pred
        model_output_phase1[str(model)] = mod_result
        print (str(model))
    except Exception as e:
        print ('Exception for Model {0} : {1}'.format(str(model),e))
        y_pred = None
The loop above fits each model from the list and stores the results as a dictionary. Following are the attributes in the results.
# Print each model's metrics, excluding the bulky prediction arrays.
for key, values in model_output_phase1.items():
    print()
    print(key)
    # BUG FIX: 'Predicted' is a key *inside* each result dict, not a model
    # name, so the old `if key == 'Predicted': continue` never fired and the
    # full prediction arrays were dumped. Filter it out of the values instead.
    print({metric: val for metric, val in values.items() if metric != 'Predicted'})
Regression Distributions of Actual Test vs Predicted (Joint plot)
Linear Regression
sns.jointplot(x=y_test, y=model_output_phase1['LinearRegression()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['LinearRegression()'].get('Test Accuracy'),model_output_phase1['LinearRegression()'].get('RMSE')))
Ridge
sns.jointplot(x=y_test, y=model_output_phase1['Ridge()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['Ridge()'].get('Test Accuracy'),model_output_phase1['Ridge()'].get('RMSE')))
Lasso
sns.jointplot(x=y_test, y=model_output_phase1['Lasso()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['Lasso()'].get('Test Accuracy'),model_output_phase1['Lasso()'].get('RMSE')))
Random Forest
sns.jointplot(x=y_test, y=model_output_phase1['RandomForestRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['RandomForestRegressor()'].get('Test Accuracy'),model_output_phase1['RandomForestRegressor()'].get('RMSE')))
KNN
sns.jointplot(x=y_test, y=model_output_phase1['KNeighborsRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['KNeighborsRegressor()'].get('Test Accuracy'),model_output_phase1['KNeighborsRegressor()'].get('RMSE')))
XGBoost
sns.jointplot(x=y_test, y=model_output_phase1['XGBRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['XGBRegressor()'].get('Test Accuracy'),model_output_phase1['XGBRegressor()'].get('RMSE')))
Gradient Boosting
sns.jointplot(x=y_test, y=model_output_phase1['GradientBoostingRegressor()'].get('Predicted'), kind="reg", color="m")
print ('Test Accuracy : {0} , RMSE : {1}'.format(model_output_phase1['GradientBoostingRegressor()'].get('Test Accuracy'),model_output_phase1['GradientBoostingRegressor()'].get('RMSE')))
From the above graphs and the result, it can be inferred that the following models perform for the regression metrics compared to the other regression models used.
We will use the above models with hyper parameters, PCA, feature selection to score a better result.
# Transform the X variable to zscores and create the PCA dimensions.
data_pca = data_city.drop(['price'], axis = 1)
print (data_pca.info())
x_scaled = data_pca.apply(zscore)
x_scaled.head()
#scaled data
covMatrix = np.cov(x_scaled,rowvar=False)
print(covMatrix)
pca = PCA()
pca.fit(x_scaled)
print ('Eigen Values')
print (pca.explained_variance_)
print('Eigen Vectors')
print(pca.explained_variance_ratio_)
print(len(pca.explained_variance_ratio_))
tot = sum(pca.explained_variance_)
var_explained = [(i / tot) for i in sorted(pca.explained_variance_, reverse=True)]
# an array of variance explained by each
# eigen vector... there will be 90 entries as there are 90 eigen vectors)
cum_var_exp = np.cumsum(var_explained)
print(len(var_explained))
print(len(cum_var_exp))
print(cum_var_exp)
It can be inferred that 85% of the variance is contributed by 18 features.
Ensemble Feature Selection
Feature Selection for Random Forest
plt.figure(figsize=(15, 10))
plt.bar(range(0,21), var_explained, alpha = 0.5, align='center', label='individual explained variance')
plt.step(range(0,21), cum_var_exp, label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
#feature importance
# Feature importances from the (already fitted) baseline random forest.
rf_feature = pd.DataFrame(randomForest.feature_importances_, columns=["Imp"], index=x_train.columns)
# Round for display, then sort numerically. The original code formatted Imp
# into strings, sorted those lexicographically (only correct by luck for
# values in [0, 1)), and also had a dead, unassigned sort_values call.
rf_feature['Imp'] = rf_feature['Imp'].round(5)
rf_feature = rf_feature.sort_values(by="Imp", ascending=False)
rf_feature[:21].plot.bar(figsize=(15, 10))
print (rf_feature[:21])
# Cumulative importance captured by the top-k features (as a percentage).
print("First 7 feature importance:\t",(rf_feature[:7].sum())*100)
print("First 10 feature importance:\t",(rf_feature[:10].sum())*100)
print("First 12 feature importance:\t",(rf_feature[:12].sum())*100)
print("First 13 feature importance:\t",(rf_feature[:13].sum())*100)
print("First 15 feature importance:\t",(rf_feature[:15].sum())*100)
print("First 20 feature importance:\t",(rf_feature[:20].sum())*100)
Feature Selection for XGB
#feature importance for XGB
# Feature importances from the (already fitted) baseline XGB regressor.
xgb_feature = pd.DataFrame(xgb.feature_importances_, columns=["Imp"], index=x_train.columns)
# Round for display, then sort numerically — the original sorted *formatted
# strings* (lexicographic) and carried a dead, unassigned sort_values call.
xgb_feature['Imp'] = xgb_feature['Imp'].round(5)
xgb_feature = xgb_feature.sort_values(by="Imp", ascending=False)
xgb_feature[:21].plot.bar(figsize=(15, 10))
print (xgb_feature[:21])
#First 20 features have an importance of 90.5% and first 30 have importance of 95.15
print("First 7 feature importance:\t",(xgb_feature[:7].sum())*100)
print("First 10 feature importance:\t",(xgb_feature[:10].sum())*100)
print("First 12 feature importance:\t",(xgb_feature[:12].sum())*100)
print("First 13 feature importance:\t",(xgb_feature[:13].sum())*100)
print("First 15 feature importance:\t",(xgb_feature[:15].sum())*100)
print("First 20 feature importance:\t",(xgb_feature[:20].sum())*100)
From above we could see that the first 12 features in both Random Forest and XGB equates to 96%.
XGB - 95.94
Random Forest: 96.02
feature_xgb = [ 'quality','living_measure','coast', 'sight', 'long', 'lat', 'yr_built',
'living_measure15','ceil_measure','zipcode', 'room_bath','yr_renovated']
feature_rf = ['living_measure','quality','lat','long','furnished','living_measure15','coast','yr_built','ceil_measure',
'zipcode','sight','lot_measure15']
features = set(feature_xgb).union(feature_rf)
features = list(features)
features.append('price')
print(features)
data_pca = data_city[features]
print (data_pca.head())
With the reduced features, we will proceed with the Grid Search operation for feature importance
xgs = data_pca.drop("price" , axis=1)
ygs = data_pca["price"]
#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
print(xgs_train.shape)
print(xgs_test.shape)
print(ygs_train.shape)
print(ygs_test.shape)
Lets Create a new Dataframe to hold all the results which are needed. This is just a place holder. Initializing a dataframe simply and then adding them one by one is a costly affair in terms of memory. We will create them on the go with the below variable.
pd_reg_search_results = None
Hyper-tuning the parameters for Random Forest
RF_ht = RandomForestRegressor()
params_rf = {"n_estimators": np.arange(70,90,1),"max_depth": np.arange(12,20,1),
"max_features":np.arange(5,10,1),'min_samples_leaf': range(4, 8, 1),
'min_samples_split': range(16, 20, 1)}
rf_random = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(xgs_train,ygs_train)
print (rf_random.best_params_)
print (rf_random.best_score_)
params_rf = {'min_samples_leaf': range(3, 9, 1),
}
rf_random = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(xgs_train,ygs_train)
print (rf_random.best_params_)
print (rf_random.best_score_)
params_rf = {"max_depth": np.arange(12,20,1),
}
rf_random2 = RandomizedSearchCV(estimator = RF_ht, param_distributions = params_rf,
n_iter = 5, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random2.fit(xgs_train,ygs_train)
print (rf_random2.best_params_)
print (rf_random2.best_score_)
'''
Can we make this as a separate function as this will be repeated in further models. Added here for better readability.
Based on Tomorrow's discussion we can modify and fine tune again for better readability
'''
def calc_RandomForestRegressor(x_train, x_test, y_train, y_test, n_estimators=0, min_samples_split=0,
                               min_samples_leaf=0, max_features=0, max_depth=0, name=''):
    """Fit a RandomForestRegressor with the given hyper-parameters and report metrics.

    Parameters mirror the RandomForestRegressor constructor; `name` labels the
    run in the returned dict.

    Returns a dict with train/test scores (%), RMSE, MAE, plain and
    variance-weighted R-square, and the test-set predictions under 'Y Pred'.
    """
    rfg = RandomForestRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                max_features=max_features, max_depth=max_depth)
    rfg.fit(x_train, y_train)
    y_pred_rfg = rfg.predict(x_test)
    train_score = rfg.score(x_train, y_train) * 100
    test_score = rfg.score(x_test, y_test) * 100
    # BUG FIX: mean_squared_error(..., squared=False) already returns the
    # RMSE; the old code applied np.sqrt a second time, reporting sqrt(RMSE).
    rmse_val = mean_squared_error(y_test, y_pred_rfg, squared=False)
    mae = mean_absolute_error(y_test, y_pred_rfg)
    r2_score_val = r2_score(y_test, y_pred_rfg)
    r2_score_variance_weighted = r2_score(y_test, y_pred_rfg, multioutput='variance_weighted')
    print('Train Score: {0}%'.format(train_score))
    print('Test Score: {0}%'.format(test_score))
    print('RMSE:{0}'.format(rmse_val))
    print( 'Mean Absolute Error:{0}'.format(mae))
    print ('R Square: {0}'.format(r2_score_val))
    # BUG FIX: previously printed r2_score_val here again instead of the
    # variance-weighted value.
    print ('R Square Variance Weighted : {0}'.format(r2_score_variance_weighted))
    return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_rfg}
rf_result_random = calc_RandomForestRegressor(xgs_train, xgs_test, ygs_train, ygs_test,
n_estimators= 83, min_samples_split=19, min_samples_leaf= 4,
max_features= 8, max_depth= 15, name = 'RF Train')
print(rf_result_random)
Creating the result Dataframe to add the rows
pd_reg_search_results = pd.DataFrame({'Method':rf_result_random.get('Method') ,
'Name':'Random Forest (estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
'Train Score (%)':rf_result_random.get('Train Square'),
'Test Score (%)':rf_result_random.get('Test Score'),
'R Square':rf_result_random.get('R Square'),
},index = [0]
)
pd_reg_search_results
print (rf_feature[:21])
print("First 7 feature importance:\t",(rf_feature[:7].sum())*100)
print("First 10 feature importance:\t",(rf_feature[:10].sum())*100)
print("First 12 feature importance:\t",(rf_feature[:12].sum())*100)
Taking only the top 10 columns to see if that helps the performance of the Random Forest Regressor; the dropped columns carry very little weight.
# Work on an explicit copy so the inplace drop below cannot mutate data_city
# through a shared view (and cannot raise SettingWithCopyWarning).
data_pca1 = data_city[features].copy()
data_pca1.drop(["sight","lot_measure15"] ,inplace = True,axis = 1)
print(data_pca1.info())
xgs1 = data_pca1.drop("price" , axis=1)
ygs1 = data_pca1["price"]
#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train1, xgs_test1, ygs_train1, ygs_test1 = train_test_split(xgs1, ygs1, test_size=0.2, random_state=10)
print(xgs_train1.shape)
print(xgs_test1.shape)
print(ygs_train1.shape)
print(ygs_test1.shape)
rf_res_reduced_train = calc_RandomForestRegressor(xgs_train1, xgs_test1, ygs_train1, ygs_test1,n_estimators= 83,
min_samples_split=19, min_samples_leaf= 4, max_features= 8,
max_depth= 15, name = 'RF Train2')
pd_reg_search_results
df2 = {'Method':rf_res_reduced_train.get('Method') ,
       'Name':'Features Removed Random Forest (estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
       # BUG FIX: the train score previously came from rf_result_random (the
       # earlier, full-feature run), not from the reduced-feature run this
       # row describes.
       'Train Score (%)':rf_res_reduced_train.get('Train Square'),
       'Test Score (%)':rf_res_reduced_train.get('Test Score'),
       'R Square':rf_res_reduced_train.get('R Square'),
       }
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way.
pd_reg_search_results = pd.concat([pd_reg_search_results, pd.DataFrame([df2])], ignore_index=True, sort=False)
From the reduced-feature run it can be inferred that the columns living measure, lot measure and ceil measure are multicollinear and related to each other. Let's try dropping the repeated columns and see whether we can improve on 87%.
Following are the columns which can be dropped.
# BUG FIX: `data_pca2 = data_pca1` only aliased the frame, so the inplace
# drop below also mutated data_pca1. Dropping without inplace returns an
# independent frame.
data_pca2 = data_pca1.drop(["living_measure15","ceil_measure"], axis=1)
print(data_pca2.info())
xgs2 = data_pca2.drop("price" , axis=1)
ygs2 = data_pca2["price"]
#xgs_train, xgs_test, ygs_train, ygs_test = train_test_split(xgs, ygs, test_size=0.2, random_state=10)
xgs_train2, xgs_test2, ygs_train2, ygs_test2 = train_test_split(xgs2, ygs2, test_size=0.2, random_state=10)
print(xgs_train2.shape)
print(xgs_test2.shape)
print(ygs_train2.shape)
print(ygs_test2.shape)
rf_res_reduced_train1 = calc_RandomForestRegressor(xgs_train2, xgs_test2, ygs_train2, ygs_test2,n_estimators= 83,
min_samples_split=19, min_samples_leaf= 3, max_features= 8,
max_depth= 15, name = 'RF Train3')
df3 = {'Method':rf_res_reduced_train1.get('Method') ,
       'Train Score (%)':rf_res_reduced_train1.get('Train Square'),
       # Typo fixed in the display label: "frmo" -> "from".
       'Name':'Multi collinear removed from Random Forest(estimators:83, max_depth:15, max_features:8,min_samples_leaf:4)',
       'Test Score (%)':rf_res_reduced_train1.get('Test Score'),
       'R Square':rf_res_reduced_train1.get('R Square')
       }
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way.
pd_reg_search_results = pd.concat([pd_reg_search_results, pd.DataFrame([df3])], ignore_index=True, sort=False)
pd_reg_search_results.iloc[0,1]
From the above, we tried Random Forest hyper-parameter tuning and removed some columns post-tuning as well. The best scores gained here were: Train Score: 91.98782014869201%, Test Score: 87.1806477469115%, RMSE: 357.8097448779679.
XGB Regressor
xgb = XGBRegressor(learning_rate=0.02, n_estimators=600,
silent=False, nthread=1)
params_xgb = {
'gamma': [0.1,0.5, 1, 1.5, 2, 5],
'max_depth': [2,3, 4, 5],
'max_features':[8,9,10,11],
'min_samples_leaf':[6,7,8],
"n_estimators": [50,75,100]
}
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb,
n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
xgb_random.fit(xgs_train,ygs_train)
print (xgb_random.best_params_)
print (xgb_random.best_score_)
def calc_xgb(x_train, x_test, y_train, y_test, learning_rate=0, n_estimators=0, silent=False, nthread=0,
             min_samples_leaf=0, max_features=0, max_depth=0, gamma=0, booster='gbtree', name=''):
    """Fit an XGBRegressor with the given hyper-parameters and report metrics.

    Parameters mirror the XGBRegressor constructor; `name` labels the run in
    the returned dict.

    Returns a dict with train/test scores (%), RMSE, MAE, plain and
    variance-weighted R-square, and the test-set predictions under 'Y Pred'.
    """
    xgb = XGBRegressor(learning_rate=learning_rate, n_estimators=n_estimators, silent=silent, nthread=nthread,
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features, max_depth=max_depth, gamma=gamma, booster=booster)
    xgb.fit(x_train, y_train)
    y_pred_xgb = xgb.predict(x_test)
    train_score = xgb.score(x_train, y_train) * 100
    test_score = xgb.score(x_test, y_test) * 100
    # BUG FIX: mean_squared_error(..., squared=False) already returns the
    # RMSE; the old code applied np.sqrt a second time, reporting sqrt(RMSE).
    rmse_val = mean_squared_error(y_test, y_pred_xgb, squared=False)
    mae = mean_absolute_error(y_test, y_pred_xgb)
    r2_score_val = r2_score(y_test, y_pred_xgb)
    r2_score_variance_weighted = r2_score(y_test, y_pred_xgb, multioutput='variance_weighted')
    print('Train Score: {0}%'.format(train_score))
    print('Test Score: {0}%'.format( test_score))
    print('RMSE:{0}'.format(rmse_val))
    print('MAE:{0}'.format(mae))
    # BUG FIX: the old code wrote "'...'.format(x) * 100", which multiplies
    # the *string* and printed the message 100 times.
    print('R Square : {0}'.format(r2_score_val))
    print('R Square Variance Weighted :{0}'.format(r2_score_variance_weighted))
    return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_xgb}
xgb_rand_res = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.02, n_estimators=100,silent=False, nthread=0,
min_samples_leaf = 8, max_features = 11, max_depth = 5, gamma = 0.5, name = 'XGB Random')
df4 = {'Method':xgb_rand_res.get('Method') ,
       'Name':'XG Boost (estimators:100, Learning Rate:0.02, max_features:11,min_samples_leaf:8)',
       'Train Score (%)':xgb_rand_res.get('Train Square'),
       'Test Score (%)':xgb_rand_res.get('Test Score'),
       'R Square':xgb_rand_res.get('R Square'),
       'R Square Variance Weighted':xgb_rand_res.get('R Square Variance Weighted')}
# BUG FIX: the old code appended df3 (the previous model's row) instead of
# the df4 built just above. Also, DataFrame.append was removed in pandas 2.0.
pd_reg_search_results = pd.concat([pd_reg_search_results, pd.DataFrame([df4])], ignore_index=True)
params_xgb1 = {
"n_estimators": [100, 150, 200,250,350,400]
}
rf_xgb1 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb1,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb1.fit(xgs_train,ygs_train)
print (rf_xgb1.best_params_)
print (rf_xgb1.best_score_)
params_xgb1 = {
'min_samples_leaf': [5,6,7]
}
rf_xgb2 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb1,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb2.fit(xgs_train,ygs_train)
print (rf_xgb2.best_params_)
print (rf_xgb2.best_score_)
params_xgb3 = {
'max_features': [7,8,9,10,11,12]
}
rf_xgb3 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb3,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb3.fit(xgs_train,ygs_train)
print (rf_xgb3.best_params_)
print (rf_xgb3.best_score_)
params_xgb4 = {
'max_depth': [5,6,8,10]
}
rf_xgb4 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb4,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb4.fit(xgs_train,ygs_train)
print (rf_xgb4.best_params_)
print (rf_xgb4.best_score_)
params_xgb5 = {
'gamma': [0.1, 0.5, 1, 1.5, 2, 2.5,5]
}
rf_xgb5 = RandomizedSearchCV(estimator = xgb, param_distributions = params_xgb5,n_iter = 6, cv = 5, verbose=1, random_state=35, n_jobs = 5)
rf_xgb5.fit(xgs_train,ygs_train)
print (rf_xgb5.best_params_)
print (rf_xgb5.best_score_)
#learning_rate=0.025
xgb_tuned_res1 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.05, n_estimators=200,silent=False, nthread=0,
min_samples_leaf = 3, max_features = 7, max_depth = 6, gamma = 1 ,name = 'XGB Tuned 1')
df5 = {'Method':xgb_tuned_res1.get('Method') ,
'Name':'XG Boost (estimators:200, Learning Rate:0.05, max_features:7,min_samples_leaf:3)',
'Train Score (%)':xgb_tuned_res1.get('Train Square'),
'Test Score (%)':xgb_tuned_res1.get('Test Score'),
'R Square':xgb_tuned_res1.get('R Square'),
}
pd_reg_search_results = pd_reg_search_results.append(df5,ignore_index = True)
xgb_tuned_res2 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
min_samples_leaf = 5, max_features = 7 , max_depth = 6, gamma = 2, name = 'XGB Tuned 2')
df6 = {'Method':xgb_tuned_res2.get('Method') ,
'Name':'XG Boost (estimators:200, Learning Rate:0.02, max_features:7,min_samples_leaf:5)',
'Train Score (%)':xgb_tuned_res2.get('Train Square'),
'Test Score (%)':xgb_tuned_res2.get('Test Score'),
'R Square':xgb_tuned_res2.get('R Square'),
}
pd_reg_search_results = pd_reg_search_results.append(df6,ignore_index = True)
Lets try with reduced feature with 2 columns removed
xgb_tuned_res3 = calc_xgb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
min_samples_leaf = 5, max_features = 11, max_depth = 6, gamma = 2, name = 'XGB Tuned 3')
df7 = {'Method':xgb_tuned_res3.get('Method') ,
'Name':'Reduced Columns XG Boost (estimators:200, Learning Rate:0.02, max_features:11,min_samples_leaf:5)',
'Train Score (%)':xgb_tuned_res3.get('Train Square'),
'Test Score (%)':xgb_tuned_res3.get('Test Score'),
'R Square':xgb_tuned_res3.get('R Square'),
}
pd_reg_search_results = pd_reg_search_results.append(df7,ignore_index = True)
xgb_tuned_res4 = calc_xgb(xgs_train2, xgs_test2, ygs_train2, ygs_test2,learning_rate=0.03, n_estimators=200,silent=False, nthread=0,
                          min_samples_leaf = 5, max_features = 7, max_depth = 6, gamma = 2, name = 'XGB Tuned 4')
# BUG FIX: this row describes run 4 but the old code pulled 'Method' from
# xgb_tuned_res3.
df8 = {'Method':xgb_tuned_res4.get('Method') ,
       'Name':'Reduced Columns XG Boost (estimators:200, Learning Rate:0.03, max_features:7,min_samples_leaf:5)',
       'Train Score (%)':xgb_tuned_res4.get('Train Square'),
       'Test Score (%)':xgb_tuned_res4.get('Test Score'),
       'R Square':xgb_tuned_res4.get('R Square')
       }
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way.
pd_reg_search_results = pd.concat([pd_reg_search_results, pd.DataFrame([df8])], ignore_index=True)
xgb_tuned_res5 = calc_xgb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.05, n_estimators=100,silent=False, nthread=0,
min_samples_leaf = 6, max_features = 8, max_depth = 6, gamma = 0.1 ,name = 'XGB Tuned 1')
xgb_tuned_res5 = calc_xgb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.025, n_estimators=100,silent=False, nthread=0,
min_samples_leaf = 6, max_features = 8, max_depth = 6, gamma = 0.1 ,name = 'XGB Tuned 1')
pd_reg_search_results.sort_values('Test Score (%)',ascending=False)
Combining the datasets for visualization with loop as tuples.
datasets_for_visualization = [(rf_result_random ,ygs_test),
(rf_res_reduced_train, ygs_test1),
(xgb_rand_res, ygs_test),
(xgb_tuned_res1, ygs_test),
(xgb_tuned_res2, ygs_test),
(xgb_tuned_res3 , ygs_test1)
]
# Overlay actual (yellow) vs predicted (black) price curves for each tuned run.
for result, actual_series in datasets_for_visualization:
    plt.figure(figsize=(14, 8))
    predicted = result.get('Y Pred')
    sns.lineplot(x=range(len(actual_series)), y=actual_series, color='yellow', linewidth=1.5)
    sns.lineplot(x=range(len(predicted)), y=predicted, color='black', linewidth=.5)
    plt.title('Actual and Predicted for {0}'.format(result.get('Method')), fontsize=20)
    plt.xlabel('Index', fontsize=10) # X-label
    plt.ylabel('Values', fontsize=10)
GBR_test=GradientBoostingRegressor(random_state=22)
param_grid1 = {'n_estimators': [50,400,500, 900]}
grid_search1 = GridSearchCV(estimator = GBR_test, param_grid = param_grid1, cv = 3, n_jobs = 2, verbose = 1)
grid_search1.fit(xgs_train,ygs_train)
print (grid_search1.best_params_)
print (grid_search1.best_score_)
param_grid2 = {'n_estimators': [ 900, 1200, 1300, 1400]}
grid_search2 = GridSearchCV(estimator = GBR_test, param_grid = param_grid2, cv = 3, n_jobs = 2, verbose = 1)
grid_search2.fit(xgs_train,ygs_train)
print (grid_search2.best_params_)
print (grid_search2.best_score_)
param_grid3 = {
'learning_rate': [0.1,0.25],
'max_depth': [5,5],
'min_samples_leaf': [5,8],
'min_samples_split': [40,50],
'n_estimators': [200, 500, 800, 1200],
}
GBR_test=GradientBoostingRegressor(random_state=22)
grid_search3 = GridSearchCV(estimator = GBR_test, param_grid = param_grid3,
cv = 5, n_jobs = 3, verbose = 1)
grid_search3.fit(xgs_train,ygs_train)
print (grid_search3.best_params_)
print (grid_search3.best_score_)
def calc_gb(x_train, x_test, y_train, y_test, learning_rate=0, n_estimators=0, silent=False, nthread=0,
            min_samples_leaf=0, max_depth=0, name=''):
    """Fit a boosted regressor with the given hyper-parameters and report metrics.

    NOTE(review): despite the "gb" name, this function instantiates an
    XGBRegressor (exactly as the original code did), not sklearn's
    GradientBoostingRegressor — confirm which estimator was intended.

    Returns a dict with train/test scores (%), RMSE, MAE, plain and
    variance-weighted R-square, and the test-set predictions under 'Y Pred'.
    """
    xgb = XGBRegressor(learning_rate=learning_rate, n_estimators=n_estimators, silent=silent, nthread=nthread,
                       min_samples_leaf=min_samples_leaf, max_depth=max_depth)
    # Echo the split shapes as a quick sanity check.
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    xgb.fit(x_train, y_train)
    y_pred_xgb = xgb.predict(x_test)
    train_score = xgb.score(x_train, y_train) * 100
    test_score = xgb.score(x_test, y_test) * 100
    # BUG FIX: mean_squared_error(..., squared=False) already returns the
    # RMSE; the old code applied np.sqrt a second time, reporting sqrt(RMSE).
    rmse_val = mean_squared_error(y_test, y_pred_xgb, squared=False)
    mae = mean_absolute_error(y_test, y_pred_xgb)
    r2_score_val = r2_score(y_test, y_pred_xgb)
    r2_score_variance_weighted = r2_score(y_test, y_pred_xgb, multioutput='variance_weighted')
    print('Train Score: {0}%'.format(train_score))
    print('Test Score: {0}%'.format( test_score))
    print('RMSE:{0}'.format(rmse_val))
    print('MAE:{0}'.format(mae))
    # BUG FIX: the old code wrote "'...'.format(x) * 100", which multiplies
    # the *string* and printed the message 100 times.
    print('R Square : {0}'.format(r2_score_val))
    print('R Square Variance Weighted :{0}'.format(r2_score_variance_weighted))
    return {'Method':name,'Train Square':train_score,'Test Score':test_score,'RMSE':rmse_val,
            'MAE':mae,'R Square':r2_score_val,'R Square Variance Weighted':r2_score_variance_weighted,
            'Y Pred':y_pred_xgb}
gb_tuned_res1 = calc_gb(xgs_train, xgs_test, ygs_train, ygs_test,learning_rate=0.001, n_estimators=1200,silent=False, nthread=0,
min_samples_leaf = 5, max_depth = 7, name = 'GB Tuned 1')
gb_tuned_res1 = calc_gb(xgs_train1, xgs_test1, ygs_train1, ygs_test1,learning_rate=0.1, n_estimators=1200,silent=False, nthread=0,
min_samples_leaf = 5, max_depth = 5, name = 'GB Tuned 2')
gb_tuned_res2 = calc_gb(xgs_train2, xgs_test2, ygs_train2, ygs_test2,learning_rate=0.1, n_estimators=1200,silent=False, nthread=0,
min_samples_leaf = 5, max_depth = 5, name = 'GB Tuned 3')
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 40-fold cross-validation of the tuned XGB configuration over the full
# reduced-feature data set.
num_folds = 40
seed = 1001
# NOTE(review): `seed` is defined but never passed to KFold, so the folds are
# not actually seeded — confirm whether shuffling/seeding was intended.
xgb = XGBRegressor(learning_rate=0.02, n_estimators=200,silent=False, nthread=0,
min_samples_leaf = 5, max_features = 11, max_depth = 6, gamma = 2)
kfold = KFold(n_splits=num_folds)
results = cross_val_score(xgb, xgs, ygs, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
from matplotlib import pyplot
# plot scores
pyplot.hist(results)
pyplot.show()
# confidence intervals
# Empirical percentile-based confidence interval over the per-fold scores.
alpha = 0.95 # for 95% confidence
p = ((1.0-alpha)/2.0) * 100 # tail regions on right and left .25 on each side indicated by P value (border)
lower = max(0.0, np.percentile(results, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(results, p))
print('confidence interval of {0}% occurs between {1} and {2}' .format (alpha*100, lower*100, upper*100))
linear,KNN models
XGB Regressor:
xgb11 = XGBRegressor(learning_rate=0.05, n_estimators=200,silent=False, nthread=0,min_samples_leaf = 3,
max_features = 7, max_depth = 6, gamma = 1)
xgb11.fit(xgs_train, ygs_train)
#New version compatibility of model with the older version
xgb11.save_model('xgb_model22.bin')
xgs_train.shape, xgs_train.columns
xgb11.predict(xgs_test)
RF Regressor:
# Refit the best random-forest configuration on the multicollinearity-reduced
# split and persist it with pickle.
rfg22 = RandomForestRegressor(n_estimators=83, max_depth=15, max_features=8, min_samples_leaf=4)
rfg22.fit(xgs_train2, ygs_train2)
import pickle
with open('rfg_model22.pkl', 'wb') as model_file:
    pickle.dump(rfg22, model_file)
xgs_train2.shape, xgs_train2.columns